package com.kdtech.analyse.news;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.kdtech.analyse.AnalyseNews;
import com.kdtech.analyse.JSoupUtils;
import com.kdtech.crawler.CrawlHTML;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.DoMainUtils;
import com.kdtech.utils.HtmlCleaner;
import com.kdtech.utils.StringUtils;

/**
 * News parser for the Shangdu site (shangdu.com).
 *
 * <p>Recognises article ("detail") page URLs and extracts title, body, date
 * and author from the page HTML. The site uses many different page
 * templates, so every field is looked up through an ordered list of CSS
 * selectors; the first selector that yields non-blank text wins.
 *
 * @author Persh
 */
public class ShangDuNewsAnalyse implements AnalyseNews {

	/**
	 * Regular expressions describing article page URLs. Each one must match
	 * the whole URL.
	 *
	 * <p>NOTE(review): the dots in host names and file extensions are not
	 * escaped, so they match any character. Left untouched here because
	 * tightening them would change which URLs are accepted.
	 */
	private static final String[] DETAIL_PAGE_REGEXES={
		"http://.*.shangdu.com/[0-9]*/[0-9]{8}/[0-9]*_[0-9]*.shtml",
		"http://.*.shangdu.com/.*/[0-9]{4}-[0-9]{2}-[0-9]{2}/[0-9]*.html",
		"http://.*.shangdu.com/industry-topics/[0-9]{6}/[0-9]*/[0-9]*.html",
		"http://.*.shangdu.com/.*/[0-9]{8}/[0-9]*_[0-9]*.shtml",
		"http://auto.shangdu.com/.*/[0-9]*.html",
		"http://.*.shangdu.com/[0-9]{4}/.*/[0-9]*.html",
		"http://.*.shangdu.com/.*/[0-9]*-[0-9]*.html",
		"http://health.shangdu.com/.*/[0-9]*.html",
		"http://.*.shangdu.com/.*/[0-9]{8}/[0-9]*_[0-9]*.html",
		"http://it.shangdu.com/[0-9]*/.*.shtml",
		"http://.*.shangdu.com/.*/[0-9]*/[0-9]*.html",
		"http://law.shangdu.com/.*/.*.asp[?]id=[0-9]*&.*",
		"http://people.shangdu.com/.*.html",
		"http://.*.shangdu.com/.*/[0-9]*-[0-9]*.shtml"};

	/** Compiled form of {@link #DETAIL_PAGE_REGEXES}, built once at class load. */
	private static final Pattern[] DETAIL_PAGE_PATTERNS=compileAll(DETAIL_PAGE_REGEXES);

	/** URLs with this prefix are judged by {@link #EDU_DETAIL_PATTERN} only. */
	private static final String EDU_URL_PREFIX="http://edu.shangdu.com";

	/** The single article URL shape accepted for the edu sub-site. */
	private static final Pattern EDU_DETAIL_PATTERN=
		Pattern.compile("http://edu.shangdu.com/Article/.*/[0-9]+/[0-9]{6,}.html");

	/** Elements removed from the document before any field is extracted. */
	private static final String[] REMOVED_SELECTORS={
		".header",
		".subNav",
		"#xx28",
		"span[onclick]"};

	/** Title selectors, in priority order. */
	private static final String[] TITLE_SELECTORS={
		"h1#h1title",
		"div.newsColumnHeader>h1",
		"div.news-left-border>div.hd>h1",
		"div.artTit>h1",
		"div.hd>h1",
		"div.left>h1",
		"div.h>h1.ph",
		"div.top1title>h2",
		"div.totle>h1",
		"div.newsD_title>h2",
		"div#title",
		"div.title",
		"h1.nry_wzb",
		"div#tb41>span",
		"div.cuse",
		"h1#Tittle"};

	/** Article body selectors, in priority order. */
	private static final String[] CONTENT_SELECTORS={
		"div#newsColumn>div.newsContent",
		"div#matter.newstext>p",
		"div#Article",
		"div.newsContent",
		"div.nzw",
		"div.news_text",
		"td#article_content",
		"div#content",
		"div.txt_zhengwen",
		"div.newsD_con",
		"div.wds",
		"div#zk_nrt",
		"div.nry_zw",
		"div#manadona",
		"div.z_text",
		"div.content",
		"td.c",
		"div#txt_zhengwen",
		"td#text"};

	/** Selectors for the text fragment the publication date is parsed from, in priority order. */
	private static final String[] DATE_SELECTORS={
		"div.newsColumnHeader>div.artInfo",
		"div.newsColumnHeader",
		"div.titbar>span",
		"div.artAbout",
		"div.info",
		"span.news_op01",
		"div.bm>div.h>p.xg1",
		"div.top1title>dl.a1>span",
		"div.totle2>span.orange",
		"div.newsD_time",
		"div#main>div.time>span",
		"div.time>div>span",
		"span.zecc",
		"div#xx21",
		"html>body>table>tr>td>table>tr>td>table>tr>td>div>div",
		"span.orange"};

	/** Label handed to the {@code JSoupUtils} author/date matchers. */
	private static final String SOURCE_LABEL="来源：";

	/**
	 * Tells whether a URL is a Shangdu article page.
	 *
	 * <p>URLs starting with {@code http://edu.shangdu.com} are accepted only
	 * when they match the edu article pattern; every other URL is accepted
	 * when it fully matches any of the general article patterns.
	 *
	 * @param url absolute URL to test; must not be {@code null}
	 * @return {@code true} if the URL matches an article page pattern
	 */
	public boolean isDetailPage(String url) {
		if (url.startsWith(EDU_URL_PREFIX)) {
			return EDU_DETAIL_PATTERN.matcher(url).matches();
		}
		for (Pattern pattern : DETAIL_PAGE_PATTERNS) {
			if (pattern.matcher(url).matches()) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Extracts title, content, date and author from a crawled page.
	 *
	 * <p>Pages whose URL is not recognised by {@link #isDetailPage(String)}
	 * are still parsed.
	 * NOTE(review): the previous code evaluated that check but did nothing
	 * with the result; confirm whether such pages were meant to be rejected.
	 *
	 * @param urlMeta crawl result carrying the page URL and raw HTML
	 * @return the populated {@link NewsMeta}, or {@code null} when
	 *         {@code urlMeta}, its HTML or its URL is {@code null} (these
	 *         cases previously ended in an exception)
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		if (urlMeta == null) {
			return null;
		}
		String htmltxt=urlMeta.getHtml();
		String url=urlMeta.getUrl();
		if (htmltxt == null || url == null) {
			// Nothing to parse; Jsoup.parse would reject a null document.
			return null;
		}

		NewsMeta news=new NewsMeta();
		news.setUrl(url);

		Document doc=Jsoup.parse(htmltxt);
		// Drop the listed elements up front so they cannot leak into any extracted field.
		for (String selector : REMOVED_SELECTORS) {
			doc.select(selector).remove();
		}

		String title=firstNonBlankText(doc, TITLE_SELECTORS);
		String content=firstNonBlankContent(url, doc, CONTENT_SELECTORS);
		String tempdate=firstNonBlankText(doc, DATE_SELECTORS);

		// Original note: "the site's own timestamps are wrong, so prefer
		// extracting the date from the URL".
		// NOTE(review): the code does the opposite - page text first, then
		// the "来源：" block, and the URL only as the last resort. Order kept
		// as it was; confirm which one is intended.
		Long date=DateUtils.matchDate(tempdate);
		String author=JSoupUtils.matchAuthor(doc, SOURCE_LABEL);
		if (date == null) {
			date=JSoupUtils.matchDate(doc, SOURCE_LABEL);
		}
		if (date == null) {
			date=DateUtils.matchDate(url);
		}

		news.setTitle(StringUtils.trimSpace(title));
		news.setContent(content);
		news.setDate(date);
		news.setAuthor(author);
		return news;
	}

	/**
	 * Manual smoke test: crawls one live article and prints the parse result
	 * followed by the domain name derived from the URL.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		ShangDuNewsAnalyse analyse=new ShangDuNewsAnalyse();
		String url="http://health.shangdu.com/xinwen/yihe/2015/61953.html";
		UrlMeta meta=CrawlHTML.responseToURL(url);
		System.out.println(analyse.parserHtml(meta));
		System.out.println(DoMainUtils.GetDomainName(url));
	}

	/**
	 * Not implemented for this site.
	 *
	 * @param meta previously parsed news item (ignored)
	 * @return always {@code null}
	 */
	public NewsMeta Update(NewsMeta meta) {
		return null;
	}

	/**
	 * @return always {@code false}
	 */
	public boolean isNeedUpdate(){
		return false;
	}

	/** Compiles every regex in {@code sources}, preserving order. */
	private static Pattern[] compileAll(String[] sources) {
		Pattern[] compiled=new Pattern[sources.length];
		for (int i=0; i < sources.length; i++) {
			compiled[i]=Pattern.compile(sources[i]);
		}
		return compiled;
	}

	/** Returns {@code true} for {@code null} or a string that is empty after {@code trim()}. */
	private static boolean isBlank(String value) {
		return value == null || value.trim().length() == 0;
	}

	/**
	 * Returns the text of the first selector that yields non-blank text; if
	 * none does, returns the result of the last selector tried.
	 */
	private static String firstNonBlankText(Document doc, String[] selectors) {
		String text=null;
		for (String selector : selectors) {
			text=doc.select(selector).text();
			if (!isBlank(text)) {
				break;
			}
		}
		return text;
	}

	/**
	 * Returns the first non-blank result of {@code HtmlCleaner.getContentHtml}
	 * over the selectors; if none is non-blank, returns the result for the
	 * last selector tried.
	 */
	private static String firstNonBlankContent(String url, Document doc, String[] selectors) {
		String html=null;
		for (String selector : selectors) {
			html=HtmlCleaner.getContentHtml(url, doc.select(selector));
			if (!isBlank(html)) {
				break;
			}
		}
		return html;
	}
}
